package org.apache.lucene.index;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.NormsConsumer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.StoredFieldsWriter;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.codecs.TermVectorsWriter;
import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FlushInfo;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CloseableThreadLocal;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.RamUsageTester;
import org.apache.lucene.util.Rethrow;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;

/**
 * Common tests for all index formats.
 */
abstract class BaseIndexFileFormatTestCase extends LuceneTestCase {

  // metadata or Directory-level objects
  private static final Set<Class<?>> EXCLUDED_CLASSES = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());

  static {
    // For Directory objects, don't take into account e.g. the NIO buffers
    EXCLUDED_CLASSES.add(Directory.class);
    EXCLUDED_CLASSES.add(IndexInput.class);

    // used for thread management, not by the index
    EXCLUDED_CLASSES.add(CloseableThreadLocal.class);
    EXCLUDED_CLASSES.add(ThreadLocal.class);

    // don't follow references to the top-level reader
    EXCLUDED_CLASSES.add(IndexReader.class);
    EXCLUDED_CLASSES.add(IndexReaderContext.class);

    // usually small, but can bump memory usage for
    // memory-efficient things like stored fields
    EXCLUDED_CLASSES.add(FieldInfos.class);
    EXCLUDED_CLASSES.add(SegmentInfo.class);
    EXCLUDED_CLASSES.add(SegmentCommitInfo.class);
    EXCLUDED_CLASSES.add(FieldInfo.class);

    // constant overhead is typically due to strings
    // TODO: can we remove this and still pass the test consistently?
    EXCLUDED_CLASSES.add(String.class);
  }

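  /**
   * A {@link RamUsageTester.Accumulator} that skips instances of the {@link #EXCLUDED_CLASSES}
   * above (unless they are the measurement root) and only charges collections and maps for
   * their entry references, since the test has no reliable way to attribute the internal
   * overhead of such structures to a codec.
   */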
  static class Accumulator extends RamUsageTester.Accumulator {

    private final Object root;

    Accumulator(Object root) {
      this.root = root;
    }

    @Override
    public long accumulateObject(Object o, long shallowSize, Map<java.lang.reflect.Field, Object> fieldValues, Collection<Object> queue) {
      for (Class<?> clazz = o.getClass(); clazz != null; clazz = clazz.getSuperclass()) {
        if (EXCLUDED_CLASSES.contains(clazz) && o != root) {
          return 0;
        }
      }
      // We have no way to estimate the size of these things in codecs, although
      // something like a Collections.newSetFromMap(new HashMap<>()) uses quite
      // some memory... So for now the test ignores the overhead of such
      // collections, but can we do better?
      long v;
      if (o instanceof Collection) {
        Collection<?> coll = (Collection<?>) o;
        queue.addAll(coll);
        v = (long) coll.size() * RamUsageEstimator.NUM_BYTES_OBJECT_REF;
      } else if (o instanceof Map) {
        final Map<?, ?> map = (Map<?,?>) o;
        queue.addAll(map.keySet());
        queue.addAll(map.values());
        v = 2L * map.size() * RamUsageEstimator.NUM_BYTES_OBJECT_REF;
      } else {
        v = super.accumulateObject(o, shallowSize, fieldValues, queue);
      }
      // System.out.println(o.getClass() + "=" + v);
      return v;
    }

    @Override
    public long accumulateArray(Object array, long shallowSize,
        List<Object> values, Collection<Object> queue) {
      long v = super.accumulateArray(array, shallowSize, values, queue);
      // System.out.println(array.getClass() + "=" + v);
      return v;
    }

  }

  /** Returns the codec to run tests against */
  protected abstract Codec getCodec();

  private Codec savedCodec;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    // set the default codec, so adding test cases to this isn't fragile
    savedCodec = Codec.getDefault();
    Codec.setDefault(getCodec());
  }

  @Override
  public void tearDown() throws Exception {
    Codec.setDefault(savedCodec); // restore
    super.tearDown();
  }

  /** Add random fields to the provided document. */
  protected abstract void addRandomFields(Document doc);

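  /**
   * Returns the total number of bytes used per codec file extension in the given directory,
   * ignoring the extensions returned by {@link #excludedExtensionsFromByteCounts()}.
   */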
  private Map<String, Long> bytesUsedByExtension(Directory d) throws IOException {
    Map<String, Long> bytesUsedByExtension = new HashMap<>();
    for (String file : d.listAll()) {
      if (IndexFileNames.CODEC_FILE_PATTERN.matcher(file).matches()) {
        final String ext = IndexFileNames.getExtension(file);
        final long previousLength = bytesUsedByExtension.containsKey(ext) ? bytesUsedByExtension.get(ext) : 0;
        bytesUsedByExtension.put(ext, previousLength + d.fileLength(file));
      }
    }
    bytesUsedByExtension.keySet().removeAll(excludedExtensionsFromByteCounts());

    return bytesUsedByExtension;
  }

  /**
   * Return the list of extensions that should be excluded from byte counts when
   * comparing indices that store the same content.
   */
  protected Collection<String> excludedExtensionsFromByteCounts() {
    return new HashSet<String>(Arrays.asList(new String[] {
        // segment infos store various pieces of information that don't solely depend
        // on the content of the index in the diagnostics (such as a timestamp), so we
        // exclude this file from the byte counts
        "si",
        // lock files are 0 bytes (one directory in the test could be a RAMDirectory, the other an FSDirectory)
        "lock" }));
  }

  /** The purpose of this test is to make sure that bulk merge doesn't accumulate useless data over runs. */
  public void testMergeStability() throws Exception {
    Directory dir = newDirectory();
    if (dir instanceof MockDirectoryWrapper) {
      // Otherwise, the virus checker may prevent deletion of files and cause
      // us to see too many bytes used by extension in the end:
      ((MockDirectoryWrapper) dir).setEnableVirusScanner(false);
    }
    // do not use newMergePolicy, which might return a MockMergePolicy that ignores the no-CFS ratio
    // do not use RIW, which will change things up!
    MergePolicy mp = newTieredMergePolicy();
    mp.setNoCFSRatio(0);
    IndexWriterConfig cfg = new IndexWriterConfig(new MockAnalyzer(random())).setUseCompoundFile(false).setMergePolicy(mp);
    IndexWriter w = new IndexWriter(dir, cfg);
    final int numDocs = atLeast(500);
    for (int i = 0; i < numDocs; ++i) {
      Document d = new Document();
      addRandomFields(d);
      w.addDocument(d);
    }
    w.forceMerge(1);
    w.commit();
    w.close();
    DirectoryReader reader = DirectoryReader.open(dir);

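    // Now rewrite the same content into a second directory via addIndexes and compare the
    // per-extension byte counts: a stable merge should neither add nor drop any data.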
    Directory dir2 = newDirectory();
    if (dir2 instanceof MockDirectoryWrapper) {
      // Otherwise, the virus checker may prevent deletion of files and cause
      // us to see too many bytes used by extension in the end:
      ((MockDirectoryWrapper) dir2).setEnableVirusScanner(false);
    }
    mp = newTieredMergePolicy();
    mp.setNoCFSRatio(0);
    cfg = new IndexWriterConfig(new MockAnalyzer(random())).setUseCompoundFile(false).setMergePolicy(mp);
    w = new IndexWriter(dir2, cfg);
    TestUtil.addIndexesSlowly(w, reader);

    w.commit();
    w.close();

    assertEquals(bytesUsedByExtension(dir), bytesUsedByExtension(dir2));

    reader.close();
    dir.close();
    dir2.close();
  }

  /** Test the accuracy of the ramBytesUsed estimations. */
  @Slow
  public void testRamBytesUsed() throws IOException {
    if (Codec.getDefault() instanceof RandomCodec) {
      // this test relies on the fact that two segments will be written with
      // the same codec so we need to disable MockRandomPF
      final Set<String> avoidCodecs = new HashSet<>(((RandomCodec) Codec.getDefault()).avoidCodecs);
      avoidCodecs.add(new MockRandomPostingsFormat().getName());
      Codec.setDefault(new RandomCodec(random(), avoidCodecs));
    }
    Directory dir = newDirectory();
    IndexWriterConfig cfg = newIndexWriterConfig(new MockAnalyzer(random()));
    IndexWriter w = new IndexWriter(dir, cfg);
    // we need to index enough documents so that constant overhead doesn't dominate
    final int numDocs = atLeast(10000);
    LeafReader reader1 = null;
    for (int i = 0; i < numDocs; ++i) {
      Document d = new Document();
      addRandomFields(d);
      w.addDocument(d);
      if (i == 100) {
        w.forceMerge(1);
        w.commit();
        reader1 = getOnlySegmentReader(DirectoryReader.open(dir));
      }
    }
    w.forceMerge(1);
    w.commit();
    w.close();

    LeafReader reader2 = getOnlySegmentReader(DirectoryReader.open(dir));

    for (LeafReader reader : Arrays.asList(reader1, reader2)) {
      new SimpleMergedSegmentWarmer(InfoStream.NO_OUTPUT).warm(reader);
    }

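    // Compare the delta of the measured heap usage of the two readers (via RamUsageTester)
    // against the delta of their reported ramBytesUsed(): taking the difference between a
    // large and a small segment cancels out most of the constant per-segment overhead.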
    final long actualBytes = RamUsageTester.sizeOf(reader2, new Accumulator(reader2)) - RamUsageTester.sizeOf(reader1, new Accumulator(reader1));
    final long expectedBytes = ((SegmentReader) reader2).ramBytesUsed() - ((SegmentReader) reader1).ramBytesUsed();
    final long absoluteError = actualBytes - expectedBytes;
    final double relativeError = (double) absoluteError / actualBytes;
    final String message = "Actual RAM usage " + actualBytes + ", but ramBytesUsed() reported " + expectedBytes + ", " + 100*relativeError + "% error";
    assertTrue(message, Math.abs(relativeError) < 0.20d || Math.abs(absoluteError) < 1000);

    reader1.close();
    reader2.close();
    dir.close();
  }

  /** Calls close() multiple times on closeable codec APIs. */
  public void testMultiClose() throws IOException {
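    // Per the java.io.Closeable contract, calling close() on an already-closed instance
    // must have no effect, so every codec reader/writer below is closed more than once.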
    // first make a one-doc index
    Directory oneDocIndex = newDirectory();
    IndexWriter iw = new IndexWriter(oneDocIndex, new IndexWriterConfig(new MockAnalyzer(random())));
    Document oneDoc = new Document();
    FieldType customType = new FieldType(TextField.TYPE_STORED);
    customType.setStoreTermVectors(true);
    Field customField = new Field("field", "contents", customType);
    oneDoc.add(customField);
    oneDoc.add(new NumericDocValuesField("field", 5));
    iw.addDocument(oneDoc);
    LeafReader oneDocReader = getOnlySegmentReader(DirectoryReader.open(iw, true));
    iw.close();

    // now feed to codec apis manually
    // we use an FSDirectory: things like RAMDirectory are not guaranteed to fail if you write to them after close(), etc.
    Directory dir = newFSDirectory(createTempDir("justSoYouGetSomeChannelErrors"));
    Codec codec = getCodec();

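    // Build minimal single-document segment metadata by hand (SegmentInfo, FieldInfo, and the
    // write/read states) so that each codec format can be driven directly below.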
    SegmentInfo segmentInfo = new SegmentInfo(dir, Version.LATEST, "_0", 1, false, codec, Collections.<String,String>emptyMap(), StringHelper.randomId(), new HashMap<String,String>());
    FieldInfo proto = oneDocReader.getFieldInfos().fieldInfo("field");
    FieldInfo field = new FieldInfo(proto.name, proto.number, proto.hasVectors(), proto.omitsNorms(), proto.hasPayloads(),
                                    proto.getIndexOptions(), proto.getDocValuesType(), proto.getDocValuesGen(), new HashMap<String,String>());

    FieldInfos fieldInfos = new FieldInfos(new FieldInfo[] { field });

    SegmentWriteState writeState = new SegmentWriteState(null, dir,
                                                         segmentInfo, fieldInfos,
                                                         null, new IOContext(new FlushInfo(1, 20)));

    SegmentReadState readState = new SegmentReadState(dir, segmentInfo, fieldInfos, IOContext.READ);

    // PostingsFormat
    try (FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(writeState)) {
      consumer.write(oneDocReader.fields());
      IOUtils.close(consumer);
      IOUtils.close(consumer);
    }
    try (FieldsProducer producer = codec.postingsFormat().fieldsProducer(readState)) {
      IOUtils.close(producer);
      IOUtils.close(producer);
    }

    // DocValuesFormat
    try (DocValuesConsumer consumer = codec.docValuesFormat().fieldsConsumer(writeState)) {
      consumer.addNumericField(field, Collections.<Number>singleton(5));
      IOUtils.close(consumer);
      IOUtils.close(consumer);
    }
    try (DocValuesProducer producer = codec.docValuesFormat().fieldsProducer(readState)) {
      IOUtils.close(producer);
      IOUtils.close(producer);
    }

    // NormsFormat
    try (NormsConsumer consumer = codec.normsFormat().normsConsumer(writeState)) {
      consumer.addNormsField(field, Collections.<Number>singleton(5));
      IOUtils.close(consumer);
      IOUtils.close(consumer);
    }
    try (NormsProducer producer = codec.normsFormat().normsProducer(readState)) {
      IOUtils.close(producer);
      IOUtils.close(producer);
    }

    // TermVectorsFormat
    try (TermVectorsWriter consumer = codec.termVectorsFormat().vectorsWriter(dir, segmentInfo, writeState.context)) {
      consumer.startDocument(1);
      consumer.startField(field, 1, false, false, false);
      consumer.startTerm(new BytesRef("testing"), 2);
      consumer.finishTerm();
      consumer.finishField();
      consumer.finishDocument();
      consumer.finish(fieldInfos, 1);
      IOUtils.close(consumer);
      IOUtils.close(consumer);
    }
    try (TermVectorsReader producer = codec.termVectorsFormat().vectorsReader(dir, segmentInfo, fieldInfos, readState.context)) {
      IOUtils.close(producer);
      IOUtils.close(producer);
    }

    // StoredFieldsFormat
    try (StoredFieldsWriter consumer = codec.storedFieldsFormat().fieldsWriter(dir, segmentInfo, writeState.context)) {
      consumer.startDocument();
      consumer.writeField(field, customField);
      consumer.finishDocument();
      consumer.finish(fieldInfos, 1);
      IOUtils.close(consumer);
      IOUtils.close(consumer);
    }
    try (StoredFieldsReader producer = codec.storedFieldsFormat().fieldsReader(dir, segmentInfo, fieldInfos, readState.context)) {
      IOUtils.close(producer);
      IOUtils.close(producer);
    }

    IOUtils.close(oneDocReader, oneDocIndex, dir);
  }

  /** Tests exception handling on write and openInput/createOutput */
  // TODO: this is really not ideal. each BaseXXXTestCase should have unit tests doing this.
  // but we use this shotgun approach to prevent bugs in the meantime: it just ensures the
  // codec does not corrupt the index or leak file handles.
  public void testRandomExceptions() throws Exception {
    // disable slow things: we don't rely upon sleeps here.
    MockDirectoryWrapper dir = newMockDirectory();
    dir.setThrottling(MockDirectoryWrapper.Throttling.NEVER);
    dir.setUseSlowOpenClosers(false);
    dir.setPreventDoubleWrite(false);
    dir.setRandomIOExceptionRate(0.001); // more rare
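    // Note: this rate applies to operations on files that are already open; exceptions on
    // openInput/createOutput are controlled separately via setRandomIOExceptionRateOnOpen
    // inside the loop below.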

    // log all exceptions we hit, in case we fail (for debugging)
    ByteArrayOutputStream exceptionLog = new ByteArrayOutputStream();
    PrintStream exceptionStream = new PrintStream(exceptionLog, true, "UTF-8");
    //PrintStream exceptionStream = System.out;

    Analyzer analyzer = new MockAnalyzer(random());

    IndexWriterConfig conf = newIndexWriterConfig(analyzer);
    // just for now, try to keep this test reproducible
    conf.setMergeScheduler(new SerialMergeScheduler());
    conf.setCodec(getCodec());

    int numDocs = atLeast(500);

    IndexWriter iw = new IndexWriter(dir, conf);
    try {
      boolean allowAlreadyClosed = false;
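      // allowAlreadyClosed is set to true once a fake IOException has been injected (which may
      // have aborted and closed the writer); an AlreadyClosedException is only tolerated then.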
      for (int i = 0; i < numDocs; i++) {
        dir.setRandomIOExceptionRateOnOpen(0.02); // turn on exceptions for openInput/createOutput

        Document doc = new Document();
        doc.add(newStringField("id", Integer.toString(i), Field.Store.NO));
        addRandomFields(doc);

        // single doc
        try {
          iw.addDocument(doc);
          // we made it, sometimes delete our doc
          iw.deleteDocuments(new Term("id", Integer.toString(i)));
        } catch (AlreadyClosedException ace) {
          // OK: writer was closed by abort; we just reopen now:
          dir.setRandomIOExceptionRateOnOpen(0.0); // disable exceptions on openInput until next iteration
          assertTrue(iw.deleter.isClosed());
          assertTrue(allowAlreadyClosed);
          allowAlreadyClosed = false;
          conf = newIndexWriterConfig(analyzer);
          // just for now, try to keep this test reproducible
          conf.setMergeScheduler(new SerialMergeScheduler());
          conf.setCodec(getCodec());
          iw = new IndexWriter(dir, conf);
        } catch (Exception e) {
          if (e.getMessage() != null && e.getMessage().startsWith("a random IOException")) {
            exceptionStream.println("\nTEST: got expected fake exc:" + e.getMessage());
            e.printStackTrace(exceptionStream);
            allowAlreadyClosed = true;
          } else {
            Rethrow.rethrow(e);
          }
        }

        if (random().nextInt(10) == 0) {
          // trigger flush:
          try {
            if (random().nextBoolean()) {
              DirectoryReader ir = null;
              try {
                ir = DirectoryReader.open(iw, random().nextBoolean());
                dir.setRandomIOExceptionRateOnOpen(0.0); // disable exceptions on openInput until next iteration
                TestUtil.checkReader(ir);
              } finally {
                IOUtils.closeWhileHandlingException(ir);
              }
            } else {
              dir.setRandomIOExceptionRateOnOpen(0.0); // disable exceptions on openInput until next iteration:
                                                       // or we make slowExists angry and trip a scarier assert!
              iw.commit();
            }
            if (DirectoryReader.indexExists(dir)) {
              TestUtil.checkIndex(dir);
            }
          } catch (AlreadyClosedException ace) {
            // OK: writer was closed by abort; we just reopen now:
            dir.setRandomIOExceptionRateOnOpen(0.0); // disable exceptions on openInput until next iteration
            assertTrue(iw.deleter.isClosed());
            assertTrue(allowAlreadyClosed);
            allowAlreadyClosed = false;
            conf = newIndexWriterConfig(analyzer);
            // just for now, try to keep this test reproducible
            conf.setMergeScheduler(new SerialMergeScheduler());
            conf.setCodec(getCodec());
            iw = new IndexWriter(dir, conf);
          } catch (Exception e) {
            if (e.getMessage() != null && e.getMessage().startsWith("a random IOException")) {
              exceptionStream.println("\nTEST: got expected fake exc:" + e.getMessage());
              e.printStackTrace(exceptionStream);
              allowAlreadyClosed = true;
            } else {
              Rethrow.rethrow(e);
            }
          }
        }
      }

      try {
        dir.setRandomIOExceptionRateOnOpen(0.0); // disable exceptions on openInput until next iteration:
                                                 // or we make slowExists angry and trip a scarier assert!
        iw.close();
      } catch (Exception e) {
        if (e.getMessage() != null && e.getMessage().startsWith("a random IOException")) {
          exceptionStream.println("\nTEST: got expected fake exc:" + e.getMessage());
          e.printStackTrace(exceptionStream);
          try {
            iw.rollback();
          } catch (Throwable t) {}
        } else {
          Rethrow.rethrow(e);
        }
      }
      dir.close();
    } catch (Throwable t) {
      System.out.println("Unexpected exception: dumping fake-exception-log:...");
      exceptionStream.flush();
      System.out.println(exceptionLog.toString("UTF-8"));
      System.out.flush();
      Rethrow.rethrow(t);
    }

    if (VERBOSE) {
      System.out.println("TEST PASSED: dumping fake-exception-log:...");
      System.out.println(exceptionLog.toString("UTF-8"));
    }
  }
}